#delimit;
/* Session setup: empty memory, close any log left open, disable paging. */
clear;
capture log close;
set more off;

/* Folder holding the input files (and, further down, the output file). */
local in "/data";

/* Combining those who did and did not attend school.
   Start from the 2016 file and tag its rows with new=1 so they can be told
   apart from the rows of the older file once the two are appended. */
use "`in'/nsc_2016.dta", clear;
gen new=1;

/* Remove every underscore from the identifier, then rename it. */
replace Tuedt_2003=subinstr(Tuedt_2003,"_","",.);
rename Tuedt_2003 TUEDT2003;

drop Search_Date study_ID;

/* Sanity check: non-graduates carrying a degree title or a first major.
   Counts recorded by the original author: 0 and 0. */
foreach v in Degree_Title Degree_Major1 {;
    count if Graduated=="N" & `v'!="";
};

/* Graduates with a 4th, 3rd, 2nd and 1st major filled in.
   Counts recorded by the original author: 5, 18, 577 and 19,830. */
foreach k in 4 3 2 1 {;
    count if Graduated=="Y" & Degree_Major`k'!="";
};

/* Graduates with a CIP code for the first major (recorded count: 12,508). */
count if Graduated=="Y" & Degree_CIP1!="";

/* Graduates with a CIP code but no first major (recorded count: 0). */
count if Graduated=="Y" & Degree_CIP1!="" & Degree_Major1=="";


/*
Variable naming differs between the two files: what the old file calls
COL_year24 is called Year_2yr_4yr in the new (2016) file.
*/

/* Stack the older extract underneath the data in memory. Rows that come from
   the appended file have new==. at this point, so they are recoded to new=0. */
append using "`in'/nsc_old.dta";
replace new=0 if new==.;

drop if Record_Found=="N";

/* Remove rows that are identical on every variable. */
duplicates drop;

rename TUEDT2003 pid;
/* First six characters of the college/branch code. */
gen College_Code_Branch6=substr(College_Code_Branch,1,6);

/* "stable" keeps rows that tie on all sort keys in their incoming order.
   Without it Stata orders ties randomly, and because the todrop rule below
   looks only at the immediately following row, which of several tied old rows
   gets flagged could differ from one run to the next. */
sort pid Enrollment_Begin Enrollment_End Graduation_Date College_Code_Branch6 Enrollment_Status Degree_Title new, stable;
order new;

/* Flag an old-file row when the very next row is a new-file row for the same
   person, enrollment dates, graduation date and 6-character college code.
   NOTE(review): only the old row directly preceding a new row is flagged; old
   rows separated from the new row by a different Enrollment_Status or
   Degree_Title are not. Confirm this is intended. */
gen todrop=1 if new==0 & new[_n+1]==1 & pid==pid[_n+1] & Enrollment_Begin==Enrollment_Begin[_n+1] & Enrollment_End==Enrollment_End[_n+1] & Graduation_Date==Graduation_Date[_n+1] & College_Code_Branch6==College_Code_Branch6[_n+1];
drop if todrop==1;

/* newnew==1 when the person has at least one row from the new file. */
by pid: egen newnew=max(new);

/* The new dataset is more complete, so old-file rows are discarded for every
   person who also appears in the new file. Old-file rows are kept only for
   people found in the old file alone (newnew==0) and for the single
   hard-coded pid "CQ370100000417". */
drop if new==0 & newnew==1 & pid~="CQ370100000417";

/* Carry the old file's COL_year24 into Year_2yr_4yr for old rows, then keep a
   single variable under the name COL_year24. */
replace Year_2yr_4yr=COL_year24 if new==0;
drop COL_year24;
rename Year_2yr_4yr COL_year24;

drop todrop newnew College_Code_Branch6;

/* Wherever Major is filled in it overrides Degree_Major1
   (presumably Major is the old file's major field; verify against that file). */
replace Degree_Major1=Major if Major~="";
drop Major new;

/* Size of the combined file and number of rows with a 4th / 3rd / 2nd major
   (values in comments were recorded by the original author). */
dis _N; /*258,044*/
count if Degree_Major4~=""; /*5*/
count if Degree_Major3~=""; /*18*/
count if Degree_Major2~=""; /*576*/

/* Rows that list more than one major (Degree_Major1 plus Degree_Major2, and so
   on) are split into one row per major, so that later code can work on a
   single variable, Major, instead of four. Each pass duplicates the original
   rows that have a k-th major and stores that major, with its CIP code, on the
   copy. Only original rows (newobs==0) are duplicated in passes two and three,
   so copies are never copied again.
   Every sort uses "stable" so that rows tying on the sort keys keep their
   incoming order instead of being shuffled randomly, which otherwise changes
   the row order of the saved file from run to run. */

/* Pass 1: fourth major. Original rows take Degree_Major1 / Degree_CIP1. */
expand 2 if Degree_Major4~="", generate(newobs);

gen Major=Degree_Major1 if newobs==0;
gen Major_CIP=Degree_CIP1 if newobs==0;
sort pid Degree_Major4 newobs, stable;
/* Copies were left empty by the two gen statements above, so they can be
   filled directly. */
replace Major=Degree_Major4 if newobs==1;
replace Major_CIP=Degree_CIP4 if newobs==1;

/* Pass 2: third major. Mark the fresh copies as copies in newobs as well. */
expand 2 if Degree_Major3~="" & newobs==0, generate(newobs1);
replace newobs=1 if newobs==0 & newobs1==1;

sort pid Degree_Major3 newobs1, stable;
/* The copy inherited the first major from its source row; overwrite it. */
replace Major=Degree_Major3 if newobs1==1;
replace Major_CIP=Degree_CIP3 if newobs1==1;

/* Pass 3: second major. */
expand 2 if Degree_Major2~="" & newobs==0, generate(newobs2);
replace newobs=1 if newobs==0 & newobs2==1;

sort pid Degree_Major2 newobs2, stable;
replace Major=Degree_Major2 if newobs2==1;
replace Major_CIP=Degree_CIP2 if newobs2==1;

drop newobs newobs1 newobs2;

dis _N;

/* Restore the identifier's original name before saving. */
rename pid TUEDT2003;

save "`in'/nsc_cleaned_forsas.dta", replace;

